Crawling KBO

  • Website : http://www.koreabaseball.com
  • Target : 2015 season / all 10 teams in the KBO League
  • Player game stats

    • hitter
    • pitcher
    • defense
    • runner
  • Player basic info

    • salary and other basic information
import packages

In [1]:
import pandas as pd
import time
from selenium import webdriver
import glob
import os
import csv
Set the delay (in seconds) to wait after each page interaction

In [2]:
# Number of seconds passed to time.sleep() after every drop-down or pager
# click in the crawling functions (presumably to let the page finish
# reloading before it is read).
api_delay_term = 5

Crawling Players' stats

Hitter


In [47]:
# crawling_hitter_basic_stats
def crawling_hitter_basic(season_id, team_id):
    """
    Crawl the basic hitter stats table for one season and one team.

    Opens the KBO "HitterBasic/Basic1" record page, selects the season and
    the team by their position in the two drop-downs, then reads every result
    page into a single DataFrame. The browser is always shut down before the
    function returns, even when scraping fails.

    season_id = 0 ~ 34
    team_id = 1 ~ 10
    ------------------------------------------------------------------------------------
    <season_id>
    0 : 1982 ~ 34 : 2016

    <team_id> ==> The mapping can differ from season to season.
    1 : Nexen Heroes
    2 : Doosan
    3 : Lotte
    4 : Samsung
    5 : Hanwha
    6 : KIA
    7 : KT
    8 : LG Twins
    9 : NC Dinos
    10 : SK Wyverns

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row with the columns rank, name, team, avg,
        g, pa, ab, r, h, 2b, 3b, hr, tb, rbi, sac, sf. Every value is the
        cell text as displayed on the page (str).
    """
    columns = [
        'rank', 'name', 'team', 'avg', 'g', 'pa', 'ab', 'r', 'h', '2b',
        '3b', 'hr', 'tb', 'rbi', 'sac', 'sf'
    ]
    records = []

    # connect url
    driver = webdriver.PhantomJS()
    try:
        driver.get("http://www.koreabaseball.com/Record/Player/HitterBasic/Basic1.aspx")

        # click season
        driver.find_element_by_css_selector('#cphContainer_cphContents_ddlSeason_ddlSeason').\
            find_elements_by_css_selector('option')[season_id].click()
        time.sleep(api_delay_term)

        # click team
        driver.find_element_by_css_selector('#cphContainer_cphContents_ddlTeam_ddlTeam').\
            find_elements_by_css_selector('option')[team_id].click()
        time.sleep(api_delay_term)

        # get page number: a lone pager anchor means one page; otherwise two
        # anchors are discounted (presumably prev/next controls - confirm
        # against the site markup).
        page_number = len(driver.find_elements_by_css_selector(".paging02 a"))
        if page_number > 1:
            page_number = page_number - 2
        needs_paging = page_number > 1

        for page in range(1, page_number + 1):
            # With a single page the table is already on screen; only click
            # the pager buttons when there are several pages.
            if needs_paging:
                driver.find_element_by_css_selector(
                    '#cphContainer_cphContents_ucPager_btnNo' + str(page)).click()
                time.sleep(api_delay_term)

            # The first <tr> is skipped (presumably the header row).
            table_rows = driver.find_elements_by_css_selector(".record_result tr")[1:]
            for table_row in table_rows:
                # Fetch the cells once per row instead of once per column.
                cells = table_row.find_elements_by_css_selector('td')
                if len(cells) < len(columns):
                    # Not a data row (e.g. no <td> cells); skip it rather
                    # than fail with IndexError.
                    continue
                records.append([cell.text for cell in cells[:len(columns)]])
    finally:
        # Always terminate the browser process so repeated calls do not leak.
        driver.quit()

    return pd.DataFrame(records, columns=columns)

In [48]:
# crawling_hitter_detail_stats
def crawling_hitter_detail(season_id, team_id):
    """
    Crawl the detailed hitter stats table for one season and one team.

    Opens the KBO "HitterBasic/Detail1" record page, selects the season and
    the team by their position in the two drop-downs, then reads every result
    page into a single DataFrame. The browser is always shut down before the
    function returns, even when scraping fails.

    season_id = 0 ~ 34
    team_id = 1 ~ 10
    ------------------------------------------------------------------------------------
    <season_id>
    0 : 1982 ~ 34 : 2016

    <team_id> ==> The mapping can differ from season to season.
    1 : Nexen Heroes
    2 : Doosan
    3 : Lotte
    4 : Samsung
    5 : Hanwha
    6 : KIA
    7 : KT
    8 : LG Twins
    9 : NC Dinos
    10 : SK Wyverns

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row with the columns rank, name, team, avg,
        xbh, go, ao, go/ao, gw rbi, bb/k, p/pa, isop, xr, gpa. Every value is
        the cell text as displayed on the page (str).
    """
    columns = [
        'rank', 'name', 'team', 'avg', 'xbh', 'go', 'ao', 'go/ao', 'gw rbi',
        'bb/k', 'p/pa', 'isop', 'xr', 'gpa'
    ]
    records = []

    driver = webdriver.PhantomJS()
    try:
        driver.get("http://www.koreabaseball.com/Record/Player/HitterBasic/Detail1.aspx")

        # click season
        driver.find_element_by_css_selector('#cphContainer_cphContents_ddlSeason_ddlSeason').\
            find_elements_by_css_selector('option')[season_id].click()
        time.sleep(api_delay_term)

        # click team
        driver.find_element_by_css_selector('#cphContainer_cphContents_ddlTeam_ddlTeam').\
            find_elements_by_css_selector('option')[team_id].click()
        time.sleep(api_delay_term)

        # get page number: a lone pager anchor means one page; otherwise two
        # anchors are discounted (presumably prev/next controls - confirm
        # against the site markup).
        page_number = len(driver.find_elements_by_css_selector(".paging02 a"))
        if page_number > 1:
            page_number = page_number - 2
        needs_paging = page_number > 1

        for page in range(1, page_number + 1):
            # With a single page the table is already on screen; only click
            # the pager buttons when there are several pages.
            if needs_paging:
                driver.find_element_by_css_selector(
                    '#cphContainer_cphContents_ucPager_btnNo' + str(page)).click()
                time.sleep(api_delay_term)

            # The first <tr> is skipped (presumably the header row).
            table_rows = driver.find_elements_by_css_selector(".record_result tr")[1:]
            for table_row in table_rows:
                # Fetch the cells once per row instead of once per column.
                cells = table_row.find_elements_by_css_selector('td')
                if len(cells) < len(columns):
                    # Not a data row (e.g. no <td> cells); skip it rather
                    # than fail with IndexError.
                    continue
                records.append([cell.text for cell in cells[:len(columns)]])
    finally:
        # Always terminate the browser process so repeated calls do not leak.
        driver.quit()

    return pd.DataFrame(records, columns=columns)

Pitcher


In [49]:
# crawling_pitcher_basic
def crawling_pitcher_basic(season_id, team_id):
    """
    Crawl the basic pitcher stats table for one season and one team.

    Opens the KBO "PitcherBasic/Basic1" record page, selects the season and
    the team by their position in the two drop-downs, then reads every result
    page into a single DataFrame. The browser is always shut down before the
    function returns, even when scraping fails.

    season_id = 0 ~ 34
    team_id = 1 ~ 10
    ------------------------------------------------------------------------------------
    <season_id>
    0 : 1982 ~ 34 : 2016

    <team_id> ==> The mapping can differ from season to season.
    1 : Nexen Heroes
    2 : Doosan
    3 : Lotte
    4 : Samsung
    5 : Hanwha
    6 : KIA
    7 : KT
    8 : LG Twins
    9 : NC Dinos
    10 : SK Wyverns

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row with the columns rank, name, team, ERA,
        G, W, L, SV, HLD, WPCT, IP, H, HR, BB, HBP, SO, R, ER, WHIP. Every
        value is the cell text as displayed on the page (str).
    """
    columns = [
        'rank', 'name', 'team', 'ERA', 'G', 'W', 'L', 'SV', 'HLD', 'WPCT',
        'IP', 'H', 'HR', 'BB', 'HBP', 'SO', 'R', 'ER', 'WHIP'
    ]
    records = []

    driver = webdriver.PhantomJS()
    try:
        driver.get("http://www.koreabaseball.com/Record/Player/PitcherBasic/Basic1.aspx")

        # click season
        driver.find_element_by_css_selector('#cphContainer_cphContents_ddlSeason_ddlSeason').\
            find_elements_by_css_selector('option')[season_id].click()
        time.sleep(api_delay_term)

        # click team
        driver.find_element_by_css_selector('#cphContainer_cphContents_ddlTeam_ddlTeam').\
            find_elements_by_css_selector('option')[team_id].click()
        time.sleep(api_delay_term)

        # get page number: a lone pager anchor means one page; otherwise two
        # anchors are discounted (presumably prev/next controls - confirm
        # against the site markup).
        page_number = len(driver.find_elements_by_css_selector(".paging02 a"))
        if page_number > 1:
            page_number = page_number - 2
        needs_paging = page_number > 1

        for page in range(1, page_number + 1):
            # With a single page the table is already on screen; only click
            # the pager buttons when there are several pages.
            if needs_paging:
                driver.find_element_by_css_selector(
                    '#cphContainer_cphContents_ucPager_btnNo' + str(page)).click()
                time.sleep(api_delay_term)

            # The first <tr> is skipped (presumably the header row).
            table_rows = driver.find_elements_by_css_selector(".record_result tr")[1:]
            for table_row in table_rows:
                # Fetch the cells once per row instead of once per column.
                cells = table_row.find_elements_by_css_selector('td')
                if len(cells) < len(columns):
                    # Not a data row (e.g. no <td> cells); skip it rather
                    # than fail with IndexError.
                    continue
                records.append([cell.text for cell in cells[:len(columns)]])
    finally:
        # Always terminate the browser process so repeated calls do not leak.
        driver.quit()

    return pd.DataFrame(records, columns=columns)

In [38]:
# crawling_pitcher_detail
def crawling_pitcher_detail(season_id, team_id):
    """
    Crawl the detailed pitcher stats table for one season and one team.

    Opens the KBO "PitcherBasic/Detail1" record page, selects the season and
    the team by their position in the two drop-downs, then reads every result
    page into a single DataFrame. The browser is always shut down before the
    function returns, even when scraping fails.

    season_id = 0 ~ 34
    team_id = 1 ~ 10
    ------------------------------------------------------------------------------------
    <season_id>
    0 : 1982 ~ 34 : 2016

    <team_id> ==> The mapping can differ from season to season.
    1 : Nexen Heroes
    2 : Doosan
    3 : Lotte
    4 : Samsung
    5 : Hanwha
    6 : KIA
    7 : KT
    8 : LG Twins
    9 : NC Dinos
    10 : SK Wyverns

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row with the columns rank, name, team, ERA,
        GS, Wgs, Wgr, GF, SVO, TS, GDP, GO, AO, GO/AO. Every value is the
        cell text as displayed on the page (str).
    """
    columns = [
        'rank', 'name', 'team', 'ERA', 'GS', 'Wgs', 'Wgr', 'GF', 'SVO', 'TS',
        'GDP', 'GO', 'AO', 'GO/AO'
    ]
    records = []

    driver = webdriver.PhantomJS()
    try:
        driver.get("http://www.koreabaseball.com/Record/Player/PitcherBasic/Detail1.aspx")

        # click season
        driver.find_element_by_css_selector('#cphContainer_cphContents_ddlSeason_ddlSeason').\
            find_elements_by_css_selector('option')[season_id].click()
        time.sleep(api_delay_term)

        # click team
        driver.find_element_by_css_selector('#cphContainer_cphContents_ddlTeam_ddlTeam').\
            find_elements_by_css_selector('option')[team_id].click()
        time.sleep(api_delay_term)

        # get page number: a lone pager anchor means one page; otherwise two
        # anchors are discounted (presumably prev/next controls - confirm
        # against the site markup).
        page_number = len(driver.find_elements_by_css_selector(".paging02 a"))
        if page_number > 1:
            page_number = page_number - 2
        needs_paging = page_number > 1

        for page in range(1, page_number + 1):
            # With a single page the table is already on screen; only click
            # the pager buttons when there are several pages.
            if needs_paging:
                driver.find_element_by_css_selector(
                    '#cphContainer_cphContents_ucPager_btnNo' + str(page)).click()
                time.sleep(api_delay_term)

            # The first <tr> is skipped (presumably the header row).
            table_rows = driver.find_elements_by_css_selector(".record_result tr")[1:]
            for table_row in table_rows:
                # Fetch the cells once per row instead of once per column.
                cells = table_row.find_elements_by_css_selector('td')
                if len(cells) < len(columns):
                    # Not a data row (e.g. no <td> cells); skip it rather
                    # than fail with IndexError.
                    continue
                records.append([cell.text for cell in cells[:len(columns)]])
    finally:
        # Always terminate the browser process so repeated calls do not leak.
        driver.quit()

    return pd.DataFrame(records, columns=columns)

Defense


In [45]:
# crawling_defense
def crawling_defense(season_id, team_id):
    """
    Crawl the defense stats table for one season and one team.

    Opens the KBO "Defense/Basic" record page, selects the season and the
    team by their position in the two drop-downs, then reads every result
    page into a single DataFrame. The browser is always shut down before the
    function returns, even when scraping fails.

    season_id = 0 ~ 14
    team_id = 1 ~ 10
    ------------------------------------------------------------------------------------
    <season_id>
    0 : 2002 ~ 14 : 2016

    <team_id> ==> The mapping can differ from season to season.
    1 : Nexen Heroes
    2 : Doosan
    3 : Lotte
    4 : Samsung
    5 : Hanwha
    6 : KIA
    7 : KT
    8 : LG Twins
    9 : NC Dinos
    10 : SK Wyverns

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row with the columns rank, name, team, POS,
        G, GS, IP, E, PKO, PO, A, DP, FPCT, PB, SB, CS, CS%. Every value is
        the cell text as displayed on the page (str).
    """
    columns = [
        'rank', 'name', 'team', 'POS', 'G', 'GS', 'IP', 'E', 'PKO', 'PO',
        'A', 'DP', 'FPCT', 'PB', 'SB', 'CS', 'CS%'
    ]
    records = []

    # NOTE(review): the other crawlers in this notebook use
    # webdriver.PhantomJS(); this one uses Firefox. Kept as-is - confirm
    # whether that is intentional.
    driver = webdriver.Firefox()
    try:
        driver.get("http://www.koreabaseball.com/Record/Player/Defense/Basic.aspx")

        # click season
        driver.find_element_by_css_selector('#cphContainer_cphContents_ddlSeason_ddlSeason').\
            find_elements_by_css_selector('option')[season_id].click()
        time.sleep(api_delay_term)

        # click team
        driver.find_element_by_css_selector('#cphContainer_cphContents_ddlTeam_ddlTeam').\
            find_elements_by_css_selector('option')[team_id].click()
        time.sleep(api_delay_term)

        # get page number: a lone pager anchor means one page; otherwise two
        # anchors are discounted (presumably prev/next controls - confirm
        # against the site markup).
        page_number = len(driver.find_elements_by_css_selector(".paging02 a"))
        if page_number > 1:
            page_number = page_number - 2
        needs_paging = page_number > 1

        for page in range(1, page_number + 1):
            # With a single page the table is already on screen; only click
            # the pager buttons when there are several pages.
            if needs_paging:
                driver.find_element_by_css_selector(
                    '#cphContainer_cphContents_ucPager_btnNo' + str(page)).click()
                time.sleep(api_delay_term)

            # The first <tr> is skipped (presumably the header row).
            table_rows = driver.find_elements_by_css_selector(".record_result tr")[1:]
            for table_row in table_rows:
                # Fetch the cells once per row instead of once per column.
                cells = table_row.find_elements_by_css_selector('td')
                if len(cells) < len(columns):
                    # Not a data row (e.g. no <td> cells); skip it rather
                    # than fail with IndexError.
                    continue
                records.append([cell.text for cell in cells[:len(columns)]])
    finally:
        # Always close the browser so repeated calls do not leak processes
        # or windows.
        driver.quit()

    return pd.DataFrame(records, columns=columns)

In [46]:
# Example run: season option 13 and team option 1 (2015 / Nexen heroes
# according to the crawling_defense docstring); the result is displayed as
# the cell output.
crawling_defense(13, 1)


Out[46]:
rank name team POS G GS IP E PKO PO A DP FPCT PB SB CS CS%
0 1 김하성 넥센 유격수 140 139 1209 1/3 21 0 212 408 87 0.967 0 0 0 -
1 2 박병호 넥센 1루수 131 131 1124 1/3 12 0 1113 93 103 0.990 0 0 0 -
2 3 박동원 넥센 포수 126 122 1012 10 6 858 96 10 0.990 9 75 32 29.9
3 4 유한준 넥센 우익수 81 68 570 2/3 2 0 126 0 0 0.984 0 0 0 -
4 4 이택근 넥센 중견수 81 72 621 2/3 0 0 128 3 0 1.000 0 0 0 -
5 6 서건창 넥센 2루수 72 67 566 11 0 148 174 49 0.967 0 0 0 -
6 6 윤석민 넥센 3루수 72 64 560 1/3 11 0 49 110 17 0.935 0 0 0 -
7 8 유한준 넥센 중견수 70 61 530 0 0 122 4 0 1.000 0 0 0 -
8 8 조상우 넥센 투수 70 0 93 1/3 2 1 2 10 1 0.857 0 0 0 -
9 10 김민성 넥센 3루수 69 62 547 2/3 4 0 57 111 11 0.977 0 0 0 -
10 11 김지수 넥센 2루수 67 24 273 2/3 1 0 62 84 25 0.993 0 0 0 -
11 12 고종욱 넥센 좌익수 66 57 481 2/3 3 0 104 1 0 0.972 0 0 0 -
12 13 스나이더 넥센 우익수 60 53 470 1/3 1 0 81 3 2 0.988 0 0 0 -
13 14 김재현 넥센 포수 58 15 204 2/3 2 1 153 15 1 0.988 1 21 7 25.0
14 14 손승락 넥센 투수 58 0 61 1/3 1 0 5 16 0 0.955 0 0 0 -
15 16 김영민 넥센 투수 57 4 90 1/3 1 0 1 12 1 0.929 0 0 0 -
16 17 박헌도 넥센 좌익수 54 37 340 2/3 0 0 58 0 0 1.000 0 0 0 -
17 18 스나이더 넥센 좌익수 50 39 356 2/3 1 0 73 2 0 0.987 0 0 0 -
18 19 김대우 넥센 투수 47 1 71 0 0 6 5 0 1.000 0 0 0 -
19 20 한현희 넥센 투수 45 17 123 1/3 2 1 8 16 0 0.923 0 0 0 -
20 21 김택형 넥센 투수 37 10 58 1 0 5 8 0 0.929 0 0 0 -
21 22 김민성 넥센 2루수 36 32 250 1 0 68 98 25 0.994 0 0 0 -
22 23 마정길 넥센 투수 35 0 37 1/3 0 0 2 2 1 1.000 0 0 0 -
23 24 문성현 넥센 투수 34 13 91 2/3 0 0 8 6 0 1.000 0 0 0 -
24 25 서동욱 넥센 2루수 33 16 163 1/3 2 0 41 56 14 0.980 0 0 0 -
25 26 밴헤켄 넥센 투수 32 32 196 2/3 3 1 11 23 0 0.919 0 0 0 -
26 27 피어밴드 넥센 투수 30 30 177 1/3 4 13 1 44 1 0.918 0 0 0 -
27 28 문우람 넥센 우익수 26 12 147 0 0 28 2 0 1.000 0 0 0 -
28 29 이상민 넥센 투수 23 0 19 1/3 0 0 0 4 0 1.000 0 0 0 -
29 30 김동준 넥센 투수 22 5 50 1/3 1 1 4 8 0 0.923 0 0 0 -
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
46 47 금민철 넥센 투수 7 5 22 0 0 3 2 0 1.000 0 0 0 -
47 47 배힘찬 넥센 투수 7 0 6 2/3 0 0 1 0 0 1.000 0 0 0 -
48 47 임병욱 넥센 1루수 7 0 15 0 0 11 0 2 1.000 0 0 0 -
49 50 장영석 넥센 3루수 6 1 22 1 0 1 9 1 0.909 0 0 0 -
50 51 송성문 넥센 2루수 5 5 27 2 0 7 8 2 0.882 0 0 0 -
51 52 강지광 넥센 좌익수 4 3 23 1 0 5 0 0 0.833 0 0 0 -
52 52 스나이더 넥센 중견수 4 1 12 0 0 4 0 0 1.000 0 0 0 -
53 52 유재신 넥센 좌익수 4 0 6 2/3 0 0 0 0 0 - 0 0 0 -
54 52 임병욱 넥센 우익수 4 1 15 0 0 2 0 0 1.000 0 0 0 -
55 56 구자형 넥센 투수 3 0 2 2/3 0 0 0 1 0 1.000 0 0 0 -
56 56 박헌도 넥센 중견수 3 1 11 0 0 2 0 0 1.000 0 0 0 -
57 56 서동욱 넥센 3루수 3 1 10 0 0 0 0 0 - 0 0 0 -
58 56 임병욱 넥센 중견수 3 0 8 0 0 2 0 0 1.000 0 0 0 -
59 56 장시윤 넥센 유격수 3 2 19 1 0 4 7 0 0.917 0 0 0 -
60 61 김상수 넥센 투수 2 1 5 0 0 0 2 0 1.000 0 0 0 -
61 61 이정훈 넥센 투수 2 0 2 0 0 0 0 0 - 0 0 0 -
62 61 임병욱 넥센 좌익수 2 2 11 0 0 3 0 0 1.000 0 0 0 -
63 61 장시윤 넥센 2루수 2 0 4 0 0 1 3 1 1.000 0 0 0 -
64 65 강지광 넥센 우익수 1 0 1 0 0 0 0 0 - 0 0 0 -
65 65 고종욱 넥센 중견수 1 0 2 0 0 2 0 0 1.000 0 0 0 -
66 65 김민준 넥센 중견수 1 0 6 0 0 0 0 0 - 0 0 0 -
67 65 김정인 넥센 투수 1 0 1 0 0 0 0 0 - 0 0 0 -
68 65 박병호 넥센 3루수 1 1 8 0 0 1 0 0 1.000 0 0 0 -
69 65 서동욱 넥센 우익수 1 0 1 0 0 0 0 0 - 0 0 0 -
70 65 유재신 넥센 2루수 1 0 2 0 0 2 0 0 1.000 0 0 0 -
71 65 유재신 넥센 3루수 1 0 1 0 0 0 0 0 - 0 0 0 -
72 65 윤석민 넥센 유격수 1 1 4 1 0 1 0 0 0.500 0 0 0 -
73 65 정회찬 넥센 투수 1 0 1 0 0 0 0 0 - 0 0 0 -
74 65 허정협 넥센 우익수 1 1 6 0 0 3 0 0 1.000 0 0 0 -
75 65 홍성갑 넥센 좌익수 1 0 1 0 0 0 0 0 - 0 0 0 -

76 rows × 17 columns

Runner


In [50]:
# crawling_runner
def crawling_runner(season_id, team_id):
    """
    Crawl the base-running stats table for one season and one team.

    Opens the KBO "Runner/Basic" record page, selects the season and the
    team by their position in the two drop-downs, then reads every result
    page into a single DataFrame. The browser is always shut down before the
    function returns, even when scraping fails.

    season_id = index of the option in the season drop-down
    team_id = 1 ~ 10
    ------------------------------------------------------------------------------------
    <season_id>
    0 : 2002 ~ 14 : 2016
    NOTE(review): this docstring previously also stated "season_id = 0 ~ 34",
    which contradicts the mapping above; verify the real range against the
    season drop-down of the runner page.

    <team_id> ==> The mapping can differ from season to season.
    1 : Nexen Heroes
    2 : Doosan
    3 : Lotte
    4 : Samsung
    5 : Hanwha
    6 : KIA
    7 : KT
    8 : LG Twins
    9 : NC Dinos
    10 : SK Wyverns

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row with the columns rank, name, team, G,
        SBA, SB, CS, SB%, OOB, PKO. Every value is the cell text as
        displayed on the page (str).
    """
    columns = [
        'rank', 'name', 'team', 'G', 'SBA', 'SB', 'CS', 'SB%', 'OOB',
        'PKO'
    ]
    records = []

    driver = webdriver.PhantomJS()
    try:
        driver.get("http://www.koreabaseball.com/Record/Player/Runner/Basic.aspx")

        # click season
        driver.find_element_by_css_selector('#cphContainer_cphContents_ddlSeason_ddlSeason').\
            find_elements_by_css_selector('option')[season_id].click()
        time.sleep(api_delay_term)

        # click team
        driver.find_element_by_css_selector('#cphContainer_cphContents_ddlTeam_ddlTeam').\
            find_elements_by_css_selector('option')[team_id].click()
        time.sleep(api_delay_term)

        # get page number: a lone pager anchor means one page; otherwise two
        # anchors are discounted (presumably prev/next controls - confirm
        # against the site markup).
        page_number = len(driver.find_elements_by_css_selector(".paging02 a"))
        if page_number > 1:
            page_number = page_number - 2
        needs_paging = page_number > 1

        for page in range(1, page_number + 1):
            # With a single page the table is already on screen; only click
            # the pager buttons when there are several pages.
            if needs_paging:
                driver.find_element_by_css_selector(
                    '#cphContainer_cphContents_ucPager_btnNo' + str(page)).click()
                time.sleep(api_delay_term)

            # The first <tr> is skipped (presumably the header row).
            table_rows = driver.find_elements_by_css_selector(".record_result tr")[1:]
            for table_row in table_rows:
                # Fetch the cells once per row instead of once per column.
                cells = table_row.find_elements_by_css_selector('td')
                if len(cells) < len(columns):
                    # Not a data row (e.g. no <td> cells); skip it rather
                    # than fail with IndexError.
                    continue
                records.append([cell.text for cell in cells[:len(columns)]])
    finally:
        # Always terminate the browser process so repeated calls do not leak.
        driver.quit()

    return pd.DataFrame(records, columns=columns)

In [ ]: